info_healthy) и пациентах с СРК
(info_ibs)info_healthy <- read_xlsx ("data/raw/final_health_statistic.xlsx")
info_ibs <- read_xlsx ("data/raw/final_ibs_141_statistic.xlsx")
info_healthy и
info_ibs## Уникальные переменные в info_ibs:
## Main_Disease
## Weight_kg
## Height_cm
## BMI
## Diet_type
## Diet_duration
## Additive_usage
## Уникальные переменные в info_healthy:
## Drugs
info_healthy и info_ibs в
единый датасет combined_info c редактированием
содержимогоcombined_info <- info_healthy %>%
bind_rows(info_ibs) %>%
mutate (
BMI_min = ifelse (is.na (BMI_min), round (Weight_min /(Height_max/100 * Height_max/100), 2), BMI_min),
BMI_max = ifelse (is.na (BMI_max), round (Weight_max /(Height_min/100 * Height_min/100), 2), BMI_max)
) %>%
unite("BMI_range", BMI_min, BMI_max, sep = "-", na.rm = TRUE) %>%
unite ("Age_range", Age_min, Age_max, sep = "-", na.rm = TRUE) %>%
mutate(
Age_range = case_when(
Age_range == "18-40" | Age_range == "23-28" | Age_range == "16-42" | Age_range == "21-43" ~ "16-43",
Age <= 43 ~ "16-43",
Age > 43 ~ "> 43",
TRUE ~ NA_character_), #удалена группа 28-54
research_ID = sub ("research_", "", research_ID),
research_ID = case_when(
research_ID == 0 ~ 1,
research_ID == 1 ~ 2,
research_ID == 2 ~ 3,
research_ID == 3 ~ 4,
research_ID == 4 ~ 5,
research_ID == 6 ~ 6,
research_ID == 7 ~ 7),
patient_ID = row_number(),
Sex = ifelse (Sex == "mixed", NA, Sex),
Smoking = sub ("never", "Never", Smoking),
Smoking = case_when(
Smoking == "No" ~ "No",
Smoking == "Never" ~ "No",
Smoking == "Rarely (a few times/month)" ~ "Yes", #5 чел.
Smoking == "Occasionally (1-2 times/week)" ~ "Yes", #3 чел.
Smoking == "Regularly (3-5 times/week)" ~ "Yes", #1 чел.
Smoking == "Daily" ~ "Yes"), #7 чел.
Alcohol = sub ("rarely", "Rarely", Alcohol),
Alcohol = ifelse(Alcohol == "Regularly (3-5 times/week)"|
Alcohol == "Daily", #3 чел.
"Regularly (3-7 times/week)", Alcohol),
Antibiotics_usage = case_when(
Antibiotics_usage == "Month" | Antibiotics_usage == ~ "3 months" |
Antibiotics_usage == "6 months" ~ "1-6 months",
#2 чел. больных Month, 0 чел. здоровых 3 months, 0 чел. здоровых для 6 months
Antibiotics_usage == "Year" | Antibiotics_usage == "Not use" ~
"12 months/Not use"), # 0 чел. из здоровых для 6-12 months, 0 чел. больных для Not use
Hygiene = case_when(
Hygiene == "Occasionally (1-2 times/week) cosmetics" ~ "Occasionally cosmetics (1-2 times/week)",
Hygiene == "Rarely (a few times/month) cosmetics" ~ "Rarely cosmetics (a few times/month)",
TRUE ~ Hygiene),
Hygiene = case_when(
Hygiene == "Daily cosmetics"|Hygiene == "Regularly cosmetics (3-5 times/week)" ~ "Regularly (3-7 times/week)", #0 чел. больных для 3-5 times/week
Hygiene == "Occasionally cosmetics (1-2 times/week)" | Hygiene == "Rarely cosmetics (a few times/month)" ~ "Occasionally (a few-8 times/month)",
#только 2 чел. больных для 1-2 times/week
Hygiene == "Never cosmetics" ~ "Never"),
Physical_activity = sub ("regularly", "Regularly", Smoking),
BMI = ifelse (is.na(Weight_kg), BMI, Weight_kg/ (Height_cm/100 * Height_cm/100)),
BMI_range = ifelse(BMI_range == "", NA, BMI_range),
BMI_category = case_when(
BMI_range == "18-25" ~ "normal/overweight",
BMI_range == "19.21-29.29" ~ "normal/overweight",
BMI_range == "20.6-29.6" ~ "normal/overweight",
BMI_range == "21.74-28.38" ~ "normal/overweight",
BMI_range == "18.5-30.8" ~ "normal/overweight", #немного больше 30
BMI < 18.5 ~ "underweight",
BMI >= 18.5 & BMI < 30 ~ "normal/overweight",
BMI >= 30 ~ "obese")
) %>%
rename ("Cosmetics" = Hygiene ) %>%
mutate_if (is.character, as.factor) %>%
select(-c(
Instrument, # unique (combined_info$Instrument) = "Illumina MiSeq"
Isolation_source, # unique (combined_info$Isolation_source) = "faeces"
Assay_type, # unique (combined_info$Assay_type) = "AMPLICON"
Target_gene, # unique (combined_info$Target_gene) = "16S"
Main_Disease, # unique (combined_info$Main_Disease) = NA (for healthy), 141 (for ibs)
Drugs, # unique (combined_info$Drugs) = NA
Social_status, # unique (combined_info$Social_status) = NA, urban
Weight_kg, Height_cm, Weight_min, Weight_max, Height_min, Height_max, BMI_range, #использованы для создания BMI_category, уменьшения количества NA в BMI
BMI, # NA у всех здоровых
Birth_Year, # имеются значения только в тех случаях, где возраст уже известен
Pets_type # только cat и NA
))
rm (info_healthy, info_ibs)
summary (combined_info)
## research_ID patient_ID Seq_region Seq_date Health_state
## Min. :1.000 Min. : 1 V3-V4:106 Min. :2015 Disease:170
## 1st Qu.:3.000 1st Qu.: 96 V4 :229 1st Qu.:2017 Health :211
## Median :4.000 Median :191 V5-V6: 46 Median :2018
## Mean :4.194 Mean :191 Mean :2019
## 3rd Qu.:6.000 3rd Qu.:286 3rd Qu.:2021
## Max. :7.000 Max. :381 Max. :2023
##
## Age Age_range Sex Country
## Min. :19.00 > 43 : 44 female: 79 Spain :95
## 1st Qu.:27.25 16-43:242 male :112 Poland :70
## Median :34.00 NA's : 95 NA's :190 Belgium:46
## Mean :38.71 Germany:45
## 3rd Qu.:47.00 Italy :37
## Max. :76.00 Austria:28
## NA's :247 (Other):60
## Race Smoking Alcohol
## African American : 1 No :187 Never : 12
## Asian or Pacific Islander: 1 Yes : 16 Occasionally (1-2 times/week): 24
## Caucasian :116 NA's:178 Rarely (a few times/month) : 36
## Hispanic : 1 Regularly (3-7 times/week) : 15
## Other : 11 NA's :294
## NA's :251
##
## Antibiotics_usage Physical_activity Travel_period
## 1-6 months : 48 No :187 3 months: 18
## 12 months/Not use:129 Yes : 16 6 months: 10
## NA's :204 NA's:178 Month : 13
## Year : 45
## NA's :295
##
##
## Education_level Cosmetics
## Bachelor's level: 31 Never : 44
## Master's level : 28 Occasionally (a few-8 times/month): 17
## School Level : 25 Regularly (3-7 times/week) : 25
## NA's :297 NA's :295
##
##
##
## Sleep_duration Diet_type Diet_duration Additive_usage
## > 8 hours: 9 10.0: 13 3.0 : 8 22.0: 1
## 4-6 hours: 9 5.0 : 5 6.0 : 5 23.0: 1
## 6-7 hours: 31 NA's:363 NA's:368 4.0 : 9
## 7-8 hours: 38 NA's:370
## NA's :294
##
##
## BMI_category
## normal/overweight:287
## obese : 5
## underweight : 1
## NA's : 88
##
##
##
combined_info %>%
select (research_ID, Country, Seq_date, Health_state) %>%
group_by(research_ID, Country, Seq_date, Health_state) %>%
summarise(n = n()) %>%
flextable() %>% theme_box() %>%
merge_v(c ("research_ID", "Country", "Seq_date", "Health_state")) %>%
align(align = "center", part = "all")
research_ID | Country | Seq_date | Health_state | n |
|---|---|---|---|---|
1 | Belgium | 2,018 | Health | 46 |
2 | Italy | 36 | ||
3 | Poland | 2,023 | 70 | |
4 | Australia | 2,021 | 6 | |
Austria | 2,020 | 3 | ||
Germany | 33 | |||
2,021 | 12 | |||
Greece | 1 | |||
Hungary | 2 | |||
Ireland | Disease | 1 | ||
Israel | Health | 1 | ||
Italy | 1 | |||
Norway | 2,018 | Disease | 1 | |
UK | 7 | |||
2,020 | 2 | |||
2,021 | 3 | |||
USA | 2,018 | 6 | ||
2,020 | 1 | |||
2,021 | 5 | |||
2,022 | 1 | |||
2,023 | 1 | |||
5 | Austria | 2,017 | 25 | |
6 | Israel | 2,022 | 22 | |
7 | Spain | 2,015 | 95 |
library(gtsummary)
combined_info %>%
select(! c(patient_ID,
Diet_duration, Additive_usage, Diet_type #у всех здоровых данные переменные = NA
)) %>%
tbl_summary(digits = list(all_continuous() ~ c(0, 0),
all_categorical() ~ c(0, 0)),
by = Health_state) %>%
add_p()
| Characteristic | Disease, N = 1701 | Health, N = 2111 | p-value2 |
|---|---|---|---|
| research_ID | <0.001 | ||
| 1 | 0 (0%) | 46 (22%) | |
| 2 | 0 (0%) | 36 (17%) | |
| 3 | 0 (0%) | 70 (33%) | |
| 4 | 28 (16%) | 59 (28%) | |
| 5 | 25 (15%) | 0 (0%) | |
| 6 | 22 (13%) | 0 (0%) | |
| 7 | 95 (56%) | 0 (0%) | |
| Seq_region | <0.001 | ||
| V3-V4 | 0 (0%) | 106 (50%) | |
| V4 | 170 (100%) | 59 (28%) | |
| V5-V6 | 0 (0%) | 46 (22%) | |
| Seq_date | <0.001 | ||
| 2015 | 95 (56%) | 0 (0%) | |
| 2017 | 25 (15%) | 0 (0%) | |
| 2018 | 14 (8%) | 82 (39%) | |
| 2020 | 3 (2%) | 36 (17%) | |
| 2021 | 9 (5%) | 23 (11%) | |
| 2022 | 23 (14%) | 0 (0%) | |
| 2023 | 1 (1%) | 70 (33%) | |
| Age | 42 (32, 50) | 28 (26, 37) | <0.001 |
| Unknown | 95 | 152 | |
| Age_range | <0.001 | ||
| > 43 | 34 (45%) | 10 (5%) | |
| 16-43 | 41 (55%) | 201 (95%) | |
| Unknown | 95 | 0 | |
| Sex | 0.15 | ||
| female | 35 (48%) | 44 (37%) | |
| male | 38 (52%) | 74 (63%) | |
| Unknown | 97 | 93 | |
| Country | |||
| Australia | 0 (0%) | 6 (3%) | |
| Austria | 25 (15%) | 3 (1%) | |
| Belgium | 0 (0%) | 46 (22%) | |
| Germany | 0 (0%) | 45 (21%) | |
| Greece | 0 (0%) | 1 (0%) | |
| Hungary | 0 (0%) | 2 (1%) | |
| Ireland | 1 (1%) | 0 (0%) | |
| Israel | 22 (13%) | 1 (0%) | |
| Italy | 0 (0%) | 37 (18%) | |
| Norway | 1 (1%) | 0 (0%) | |
| Poland | 0 (0%) | 70 (33%) | |
| Spain | 95 (56%) | 0 (0%) | |
| UK | 12 (7%) | 0 (0%) | |
| USA | 14 (8%) | 0 (0%) | |
| Race | 0.4 | ||
| African American | 0 (0%) | 1 (1%) | |
| Asian or Pacific Islander | 0 (0%) | 1 (1%) | |
| Caucasian | 27 (100%) | 89 (86%) | |
| Hispanic | 0 (0%) | 1 (1%) | |
| Other | 0 (0%) | 11 (11%) | |
| Unknown | 143 | 108 | |
| Smoking | 4 (14%) | 12 (7%) | 0.2 |
| Unknown | 142 | 36 | |
| Alcohol | 0.002 | ||
| Never | 4 (14%) | 8 (14%) | |
| Occasionally (1-2 times/week) | 4 (14%) | 20 (34%) | |
| Rarely (a few times/month) | 9 (32%) | 27 (46%) | |
| Regularly (3-7 times/week) | 11 (39%) | 4 (7%) | |
| Unknown | 142 | 152 | |
| Antibiotics_usage | 0.020 | ||
| 1-6 months | 2 (8%) | 46 (30%) | |
| 12 months/Not use | 23 (92%) | 106 (70%) | |
| Unknown | 145 | 59 | |
| Physical_activity | 4 (14%) | 12 (7%) | 0.2 |
| Unknown | 142 | 36 | |
| Travel_period | 0.066 | ||
| 3 months | 3 (11%) | 15 (26%) | |
| 6 months | 1 (4%) | 9 (16%) | |
| Month | 4 (14%) | 9 (16%) | |
| Year | 20 (71%) | 25 (43%) | |
| Unknown | 142 | 153 | |
| Education_level | <0.001 | ||
| Bachelor's level | 5 (18%) | 26 (46%) | |
| Master's level | 18 (64%) | 10 (18%) | |
| School Level | 5 (18%) | 20 (36%) | |
| Unknown | 142 | 155 | |
| Cosmetics | 0.6 | ||
| Never | 15 (56%) | 29 (49%) | |
| Occasionally (a few-8 times/month) | 6 (22%) | 11 (19%) | |
| Regularly (3-7 times/week) | 6 (22%) | 19 (32%) | |
| Unknown | 143 | 152 | |
| Sleep_duration | 0.2 | ||
| > 8 hours | 4 (14%) | 5 (8%) | |
| 4-6 hours | 4 (14%) | 5 (8%) | |
| 6-7 hours | 12 (43%) | 19 (32%) | |
| 7-8 hours | 8 (29%) | 30 (51%) | |
| Unknown | 142 | 152 | |
| BMI_category | 0.012 | ||
| normal/overweight | 135 (96%) | 152 (100%) | |
| obese | 5 (4%) | 0 (0%) | |
| underweight | 1 (1%) | 0 (0%) | |
| Unknown | 29 | 59 | |
| 1 n (%); Median (IQR) | |||
| 2 Pearson’s Chi-squared test; Wilcoxon rank sum test; Fisher’s exact test | |||
bacteria_healthy) и пациентов с СРК
(bacteria_ibs) и их объединение в
combined_bacteriabacteria_healthy <- read_csv("data/raw/final_bacteria_health.csv")
bacteria_ibs <- read_csv("data/raw/final_bacteria_ibs_141.csv")
combined_bacteria <- bacteria_healthy %>%
bind_rows (bacteria_ibs) %>%
mutate(patient_ID = row_number())
rm (bacteria_healthy, bacteria_ibs)
combined_bacteria и
combined_info в data_widedata_wide <- combined_info %>%
left_join (combined_bacteria)
data_widedata_wide %>%
select (where (function(x) sum (is.na(x))/ nrow(data_wide) * 100 > 0)) %>%
sapply (function(x) sum (is.na(x))/ nrow(data_wide) * 100) %>% round(1) %>%
as.data.frame() %>%
rename(NA_percentage = ".") %>%
mutate (
"Number of people with known data" = round (nrow(data_wide) - NA_percentage/100 * nrow(data_wide)),
NA_percentage = paste (NA_percentage, "%", sep = " ")
) %>%
arrange(desc (NA_percentage)) %>%
rownames_to_column() %>%
as_tibble() %>% flextable()
rowname | NA_percentage | Number of people with known data |
|---|---|---|
Additive_usage | 97.1 % | 11 |
Diet_duration | 96.6 % | 13 |
Diet_type | 95.3 % | 18 |
Education_level | 78 % | 84 |
Travel_period | 77.4 % | 86 |
Cosmetics | 77.4 % | 86 |
Alcohol | 77.2 % | 87 |
Sleep_duration | 77.2 % | 87 |
Race | 65.9 % | 130 |
Age | 64.8 % | 134 |
Antibiotics_usage | 53.5 % | 177 |
Sex | 49.9 % | 191 |
Smoking | 46.7 % | 203 |
Physical_activity | 46.7 % | 203 |
Age_range | 24.9 % | 286 |
BMI_category | 23.1 % | 293 |
# Устанавливаем порог процента
threshold_percent <- 95
# Функция для вычисления процента записей, не равных NA и не равных 0, для каждой колонки
calculate_percentage <- function(col) {
sum(!is.na(col) & col != 0) / length(col) * 100
}
# Применяем функцию к каждой колонке в датасете
percentage_non_zero_non_na <- sapply(data_wide[, -1], calculate_percentage)
# Создаем датафрейм с результатами
result_df_sort <- data.frame(
column = names(percentage_non_zero_non_na),
percentage = round(100 - percentage_non_zero_non_na, 2) # percentage означает пропущенные или 0 значения
) %>%
arrange(desc(percentage))
# Отфильтровываем колонки, у которых процент записей менее threshold_percent%
filtered_columns <- result_df_sort[result_df_sort$percentage < threshold_percent, ]
# Сохраним датасет в excel для дальнейшего анализа
write.xlsx(filtered_columns,
file = "data/originals/percentage_by_vars.xlsx")
# Перезапись data_wide с выбором колонок с процентом NA/0 менее threshold_percent
data_wide <- data_wide %>%
select (row.names(filtered_columns), research_ID)
rm (calculate_percentage, result_df_sort)
patient_ID# Устанавливаем порог процента
threshold_percent <- 95
# Рассчитываем процент значений, не являющихся NA и не равных 0, для каждого пациента
percentage_non_zero_non_na <- rowMeans(!is.na(data_wide) & data_wide != 0, na.rm = TRUE) * 100
# Создаем датафрейм с результатами
result_df_sort <- data.frame(
patient_id = data_wide$patient_ID,
percentage = round(100 - percentage_non_zero_non_na, 2) # percentage означает пропущенные или 0 значения
) %>% arrange(desc(percentage))
# Отфильтровываем пациентов, у которых процент значений менее threshold_percent%
filtered_patients <- result_df_sort[result_df_sort$percentage < threshold_percent, ]
# Сохраним датасет в excel для дальнейшего анализа
write.xlsx(filtered_patients,
file = "data/originals/percentage_by_patient.xlsx")
# Перезапись data_wide с удалением строк с процентом NA/0 более threshold_percent
data_wide <- data_wide %>%
slice (filtered_patients$patient_id) #при threshold_percent = 95%, изменения data_wide не происходит, так как нет пациентов с процентом NA/0 более 95%
rm (percentage_non_zero_non_na, result_df_sort)
data_wide после удаления колонок и строк с
процентом NA/0 более threshold_percentdata_wide <- data_wide %>%
select (patient_ID, any_of (colnames(combined_info)), everything()) %>%
arrange(patient_ID)
write_rds(data_wide,
file = "data/originals/data_wide.rds")
data_wide в длинный формат
(data_long), загрузка bacterial_functions,
объединение data_long и
bacterial_functions# Создание датасета в длинном формате
data_long <- data_wide %>%
pivot_longer(ends_with(c("_D", "_P", "_O", "_C", "_F", "_G")),
names_to = "Taxon", values_to = "Percentage")
# Чтение листов Excel-файла с функциями бактерий и их объединение
path <- "data/raw/Bacterial group functions.xlsx"
taxon <- c ("TaxonName", "Rank")
bac_functions <- read_xlsx (path, 1) %>%
full_join(read_xlsx (path, 2), by = taxon) %>%
full_join(read_xlsx (path, 3), by = taxon) %>%
full_join(read_xlsx (path, 4), by = taxon) %>%
full_join(read_xlsx (path, 5), by = taxon) %>%
full_join(read_xlsx (path, 6), by = taxon) %>%
full_join(read_xlsx (path, 7), by = taxon) %>%
unite("Taxon", TaxonName, Rank, sep = "_") %>%
unite ("Habbit", Habbit, Habit_state, sep = "_") %>%
mutate (Habbit = sub ("NA_NA", NA, Habbit)) %>%
pivot_wider (names_from = Habbit, values_from = Habbit) %>%
mutate ("NA" = NULL) %>%
unite ("Neuromediator_destroy", Neuromediator, Destroy, sep = "_") %>%
mutate (Neuromediator_destroy = sub ("NA_NA", NA, Neuromediator_destroy)) %>%
pivot_wider(names_from = Neuromediator_destroy, values_from = Neuromediator_destroy) %>%
mutate ("NA" = NULL) %>%
pivot_wider (names_from = Vitamin, values_from = Vitamin) %>%
mutate ("NA" = NULL) %>%
unchop(everything()) %>%
unique() %>% #удаление повторяющихся строк
filter (Taxon != "Blautia obeum_S") %>% #для данного таксона противоречивая информация в Продуценты КЦЖК
mutate (across (
(3:31), function (x) ifelse (is.na(x), NA, 1))) %>%
mutate_all(as.factor)
rm (path, taxon)
#Перезапись data_long с добавлением функций бактерий
data_long <- data_long %>%
left_join (bac_functions, by = "Taxon")
#Сохранение data_long.rds
write_rds(data_long,
file = "data/originals/data_long.rds",
compress = "gz")
G_long <- data_long %>% subset(grepl("_G", Taxon))
# %>% select (where (function(x) sum (is.na(x))/ nrow(.) * 100 < threshold_percent))
F_long <- data_long %>% subset(grepl("_F", Taxon))
C_long <- data_long %>% subset(grepl("_C", Taxon))
O_long <- data_long %>% subset(grepl("_O", Taxon))
P_long <- data_long %>% subset(grepl("_P", Taxon))
write_rds(G_long,
file = "data/originals/G_long.rds", "gz")
write_rds(F_long,
file = "data/originals/F_long.rds", "gz")
write_rds(C_long,
file = "data/originals/C_long.rds", "gz")
write_rds(O_long,
file = "data/originals/O_long.rds", "gz")
write_rds(P_long,
file = "data/originals/P_long.rds", "gz")
combined_bacteria с по группе Health_state из
combined_info без учета таксономического уровня.# Создание нового датасета для сохранения результатов
result_dataset <- data.frame(
Variable_Name = character(),
Test_Type = character(),
P_Value = numeric(),
Normal_Distribution = character(),
stringsAsFactors = FALSE
)
alpha = 0.05
# Объединяем датасеты по patient_id
combined_data <- data_wide %>%
select(Health_state,
ends_with(c("_D", "_P", "_O", "_C", "_F", "_G")))
combined_bacteria_clean <- combined_data
# Получаем список переменных из датасета combined_bacteria, исключая "patient_ID"
values_to_exclude <- c("patient_ID", "Seq_date", "Age")
variable_names <- setdiff(names(combined_bacteria_clean), values_to_exclude)
# Проходим по каждой переменной
for (variable in variable_names) {
# Фильтрация данных (исключаем строки с NA и 0 в текущей переменной)
filtered_data <- combined_data[!is.na(combined_data[[variable]]) & combined_data[[variable]] != 0, ]
#print(variable)
#print(filtered_data)
# Проверим что датасет filtered_data не пустой и что количество групп сравнения более 1, в нашем случае их 2 :)
if (nrow(filtered_data) > 0 & length(unique(filtered_data$Health_state)) > 1) {
# Check if there is sufficient variability in the data
if (length(unique(filtered_data$Health_state)) > 1) {
# Проверка на нормальность
shapiro_test_result <- try(shapiro.test(filtered_data[[variable]]), silent = TRUE)
if (inherits(shapiro_test_result, "try-error")) {
warning(paste("Skipping Shapiro-Wilk test for variable", variable, "due to an error."))
next
}
p_value_shapiro <- shapiro_test_result$p.value
# Выбор соответствующего статистического теста
if (p_value_shapiro > 0.05) {
# Если нормальное распределение, провести дисперсионный анализ
model <- aov(filtered_data[[variable]] ~ Health_state, data = filtered_data)
summary_list <- summary(model)
test_type <- "ANOVA"
} else {
# Если не нормальное распределение, использовать тест Краскела-Уоллиса
tryCatch({
model <- kruskal.test(filtered_data[[variable]] ~ Health_state, data = filtered_data)
test_type <- "Kruskal-Wallis"
}, error = function(e) {
warning(paste("Skipping Kruskal-Wallis test for variable", variable, "due to an error:", conditionMessage(e)))
next
})
}
# Добавление результатов выбранного теста в датасет
result_dataset <- rbind(result_dataset,
data.frame(Variable_Name = variable,
Test_Type = test_type,
P_Value = ifelse(test_type == "ANOVA", summary_list[[1]]$"Pr(>F)"[1], model$p.value),
Normal_Distribution = ifelse(p_value_shapiro > 0.05, "Yes", "No")))
} else {
# If there is no variability, skip the tests
warning(paste("Skipping tests for variable", variable, "as there is not enough variability in the data."))
}
}}
## Warning: Skipping Shapiro-Wilk test for variable Health_state due to an error.
# Добавление столбца с поправкой Бонферрони
result_dataset$Adjusted_P_Value_Bonferroni <- p.adjust(result_dataset$P_Value, method = "bonferroni")
# Добавление столбца с поправкой Холма
result_dataset$Adjusted_P_Value_Holm <- p.adjust(result_dataset$P_Value, method = "holm")
# Добавление столбца с поправкой Бенджамини-Хохберга
result_dataset$Adjusted_P_Value_BH <- p.adjust(result_dataset$P_Value, method = "BH")
result_dataset$test_pass = ifelse(result_dataset$Adjusted_P_Value_Bonferroni < alpha & result_dataset$Adjusted_P_Value_Holm < alpha & result_dataset$Adjusted_P_Value_BH < alpha, "Y", "N")
result_dataset_pass <- result_dataset %>% filter(test_pass == "Y")
# Вывод результатов
print(result_dataset_pass)
## Variable_Name Test_Type P_Value
## 1 Eukaryota_D Kruskal-Wallis 3.641890e-07
## 2 Nitrospirota_P Kruskal-Wallis 7.764816e-07
## 3 Armatimonadota_P Kruskal-Wallis 5.555616e-10
## 4 Acidobacteriota_P Kruskal-Wallis 3.112329e-09
## 5 Patescibacteria_P Kruskal-Wallis 1.463219e-15
## 6 Cyanobacteria_P Kruskal-Wallis 6.542425e-06
## 7 Chloroflexi_P Kruskal-Wallis 6.316248e-23
## 8 Verrucomicrobiota_P Kruskal-Wallis 8.425194e-09
## 9 Actinobacteriota_P Kruskal-Wallis 6.600394e-23
## 10 Proteobacteria_P Kruskal-Wallis 2.752566e-06
## 11 Desulfitobacteriales_O Kruskal-Wallis 5.530423e-05
## 12 Pirellulales_O Kruskal-Wallis 3.196572e-09
## 13 Anaerolineales_O Kruskal-Wallis 3.746162e-09
## 14 Xanthomonadales_O Kruskal-Wallis 1.874109e-06
## 15 Pedosphaerales_O Kruskal-Wallis 1.302641e-15
## 16 Acholeplasmatales_O Kruskal-Wallis 1.552118e-08
## 17 SBR1031_O Kruskal-Wallis 3.378675e-19
## 18 Sphingobacteriales_O Kruskal-Wallis 5.724693e-10
## 19 Flavobacteriales_O Kruskal-Wallis 8.296231e-08
## 20 Staphylococcales_O Kruskal-Wallis 2.450391e-21
## 21 Monoglobales_O Kruskal-Wallis 2.153674e-06
## 22 Bifidobacteriales_O Kruskal-Wallis 1.699744e-15
## 23 Chitinophagales_O Kruskal-Wallis 5.210248e-17
## 24 Bacillales_O Kruskal-Wallis 7.441037e-10
## 25 Clostridia vadinBB60 group_O Kruskal-Wallis 9.747928e-08
## 26 Coriobacteriales_O Kruskal-Wallis 4.680932e-22
## 27 Clostridiales_O Kruskal-Wallis 2.333755e-14
## 28 Erysipelotrichales_O Kruskal-Wallis 7.511336e-10
## 29 Lactobacillales_O Kruskal-Wallis 9.824748e-11
## 30 Peptostreptococcales-Tissierellales_O Kruskal-Wallis 8.737004e-17
## 31 Oscillospirales_O Kruskal-Wallis 4.294684e-05
## 32 vadinHA49_C Kruskal-Wallis 4.885983e-06
## 33 Desulfitobacteriia_C Kruskal-Wallis 5.530423e-05
## 34 Microgenomatia_C Kruskal-Wallis 2.221509e-05
## 35 Dehalococcoidia_C Kruskal-Wallis 1.999580e-05
## 36 Phycisphaerae_C Kruskal-Wallis 3.006549e-07
## 37 Planctomycetes_C Kruskal-Wallis 4.398873e-05
## 38 Rhodothermia_C Kruskal-Wallis 5.134815e-09
## 39 Cyanobacteriia_C Kruskal-Wallis 2.599878e-06
## 40 Anaerolineae_C Kruskal-Wallis 1.126492e-19
## 41 Verrucomicrobiae_C Kruskal-Wallis 2.572550e-09
## 42 Coriobacteriia_C Kruskal-Wallis 4.373663e-22
## 43 Actinobacteria_C Kruskal-Wallis 1.748252e-19
## 44 Bacilli_C Kruskal-Wallis 6.906521e-12
## 45 Gammaproteobacteria_C Kruskal-Wallis 2.793530e-05
## 46 Pirellulaceae_F Kruskal-Wallis 3.196572e-09
## 47 Anaerolineaceae_F Kruskal-Wallis 3.746162e-09
## 48 Neisseriaceae_F Kruskal-Wallis 1.969734e-05
## 49 Microscillaceae_F Kruskal-Wallis 3.202746e-09
## 50 Pedosphaeraceae_F Kruskal-Wallis 1.302641e-15
## 51 A4b_F Kruskal-Wallis 5.664201e-18
## 52 Acholeplasmataceae_F Kruskal-Wallis 1.552118e-08
## 53 UCG-011_F Kruskal-Wallis 1.038745e-06
## 54 Selenomonadaceae_F Kruskal-Wallis 3.125086e-06
## 55 [Clostridium] methylpentosum group_F Kruskal-Wallis 2.364971e-18
## 56 Marinilabiliaceae_F Kruskal-Wallis 2.857166e-19
## 57 Flavobacteriaceae_F Kruskal-Wallis 1.274939e-05
## 58 Staphylococcaceae_F Kruskal-Wallis 4.246920e-22
## 59 Porphyromonadaceae_F Kruskal-Wallis 5.124256e-12
## 60 Eggerthellaceae_F Kruskal-Wallis 2.117931e-18
## 61 Barnesiellaceae_F Kruskal-Wallis 2.522551e-07
## 62 Monoglobaceae_F Kruskal-Wallis 2.153674e-06
## 63 Bifidobacteriaceae_F Kruskal-Wallis 1.699744e-15
## 64 Bacillaceae_F Kruskal-Wallis 1.838790e-09
## 65 Hungateiclostridiaceae_F Kruskal-Wallis 1.127118e-12
## 66 Marinifilaceae_F Kruskal-Wallis 7.717807e-11
## 67 Streptococcaceae_F Kruskal-Wallis 6.582098e-14
## 68 Peptostreptococcaceae_F Kruskal-Wallis 1.042119e-16
## 69 Erysipelatoclostridiaceae_F Kruskal-Wallis 3.073290e-13
## 70 Clostridiaceae_F Kruskal-Wallis 9.938201e-16
## 71 Erysipelotrichaceae_F Kruskal-Wallis 1.671406e-05
## 72 Tannerellaceae_F Kruskal-Wallis 4.258468e-05
## 73 [Eubacterium] coprostanoligenes group_F Kruskal-Wallis 2.001438e-06
## 74 Butyricicoccaceae_F Kruskal-Wallis 2.897180e-05
## 75 Muribaculaceae_F Kruskal-Wallis 3.758091e-05
## 76 Ruminococcaceae_F Kruskal-Wallis 1.458702e-08
## 77 Halobacillus_G Kruskal-Wallis 4.880366e-05
## 78 Megamonas_G Kruskal-Wallis 2.480778e-05
## 79 Hespellia_G Kruskal-Wallis 1.148969e-05
## 80 Lachnospiraceae NK4B4 group_G Kruskal-Wallis 3.406312e-05
## 81 XBB1006_G Kruskal-Wallis 4.544741e-06
## 82 Macellibacteroides_G Kruskal-Wallis 8.252963e-10
## 83 Paludibacter_G Kruskal-Wallis 7.121028e-06
## 84 Anaerovorax_G Kruskal-Wallis 3.805548e-08
## 85 Candidatus Phytoplasma_G Kruskal-Wallis 7.466909e-06
## 86 Selenomonas_G Kruskal-Wallis 1.912782e-05
## 87 Lachnospiraceae UCG-009_G Kruskal-Wallis 9.999884e-06
## 88 [Eubacterium] nodatum group_G Kruskal-Wallis 2.465934e-07
## 89 Mogibacterium_G Kruskal-Wallis 4.320301e-05
## 90 Papillibacter_G Kruskal-Wallis 4.897195e-06
## 91 Anaerococcus_G Kruskal-Wallis 3.272729e-12
## 92 CAG-56_G Kruskal-Wallis 9.881041e-07
## 93 Proteiniphilum_G Kruskal-Wallis 4.464993e-13
## 94 Ruminiclostridium_G Kruskal-Wallis 1.142188e-05
## 95 Syntrophococcus_G Kruskal-Wallis 2.488686e-12
## 96 Pseudobutyrivibrio_G Kruskal-Wallis 6.026604e-06
## 97 Butyricimonas_G Kruskal-Wallis 1.196336e-05
## 98 Caproiciproducens_G Kruskal-Wallis 1.550165e-05
## 99 Prevotellaceae UCG-004_G Kruskal-Wallis 4.891415e-14
## 100 UBA1819_G Kruskal-Wallis 1.981127e-19
## 101 Barnesiella_G Kruskal-Wallis 3.908806e-06
## 102 Erysipelotrichaceae UCG-003_G Kruskal-Wallis 6.623150e-10
## 103 Rikenellaceae RC9 gut group_G Kruskal-Wallis 3.864326e-16
## 104 Porphyromonas_G Kruskal-Wallis 1.759612e-14
## 105 Staphylococcus_G Kruskal-Wallis 1.471866e-23
## 106 A2_G Kruskal-Wallis 3.435145e-21
## 107 Lachnospiraceae NC2004 group_G Kruskal-Wallis 2.010754e-07
## 108 Bifidobacterium_G Kruskal-Wallis 1.343449e-14
## 109 Monoglobus_G Kruskal-Wallis 2.153674e-06
## 110 Lachnospiraceae XPB1014 group_G Kruskal-Wallis 8.116506e-06
## 111 Romboutsia_G Kruskal-Wallis 3.566692e-13
## 112 Odoribacter_G Kruskal-Wallis 9.910652e-11
## 113 Acetitomaculum_G Kruskal-Wallis 2.040740e-06
## 114 Tyzzerella_G Kruskal-Wallis 3.609278e-05
## 115 Clostridium sensu stricto 1_G Kruskal-Wallis 2.878502e-17
## 116 Marvinbryantia_G Kruskal-Wallis 1.915920e-11
## 117 NK4A214 group_G Kruskal-Wallis 1.881102e-08
## 118 Lachnospiraceae NK3A20 group_G Kruskal-Wallis 1.638832e-05
## 119 Streptococcus_G Kruskal-Wallis 6.665449e-13
## 120 Fusicatenibacter_G Kruskal-Wallis 1.087316e-07
## 121 Dorea_G Kruskal-Wallis 1.466599e-05
## 122 Anaerostipes_G Kruskal-Wallis 2.997463e-10
## 123 Subdoligranulum_G Kruskal-Wallis 5.234297e-15
## 124 Parabacteroides_G Kruskal-Wallis 3.893385e-05
## 125 [Ruminococcus] torques group_G Kruskal-Wallis 9.216209e-09
## 126 Faecalibacterium_G Kruskal-Wallis 7.571240e-10
## 127 Lachnoclostridium_G Kruskal-Wallis 1.729484e-07
## 128 Lachnospiraceae NK4A136 group_G Kruskal-Wallis 4.135324e-10
## 129 Agathobacter_G Kruskal-Wallis 4.325760e-06
## Normal_Distribution Adjusted_P_Value_Bonferroni Adjusted_P_Value_Holm
## 1 No 3.084681e-04 2.796972e-04
## 2 No 6.576800e-04 5.955614e-04
## 3 No 4.705606e-07 4.438937e-07
## 4 No 2.636142e-06 2.458740e-06
## 5 No 1.239347e-12 1.204229e-12
## 6 No 5.541434e-03 4.900276e-03
## 7 No 5.349862e-20 5.343545e-20
## 8 No 7.136139e-06 6.596927e-06
## 9 No 5.590534e-20 5.577333e-20
## 10 No 2.331423e-03 2.083692e-03
## 11 No 4.684268e-02 3.981905e-02
## 12 No 2.707496e-06 2.522095e-06
## 13 No 3.172999e-06 2.944483e-06
## 14 No 1.587370e-03 1.431819e-03
## 15 No 1.103337e-12 1.074679e-12
## 16 No 1.314644e-05 1.210652e-05
## 17 No 2.861738e-16 2.821194e-16
## 18 No 4.848815e-07 4.568305e-07
## 19 No 7.026907e-05 6.437875e-05
## 20 No 2.075481e-18 2.060779e-18
## 21 No 1.824162e-03 1.638946e-03
## 22 No 1.439683e-12 1.397190e-12
## 23 No 4.413080e-14 4.324506e-14
## 24 No 6.302558e-07 5.923065e-07
## 25 No 8.256495e-05 7.554644e-05
## 26 No 3.964750e-19 3.941345e-19
## 27 No 1.976691e-11 1.906678e-11
## 28 No 6.362102e-07 5.971512e-07
## 29 No 8.321561e-08 7.889272e-08
## 30 No 7.400242e-14 7.242976e-14
## 31 No 3.637598e-02 3.109352e-02
## 32 No 4.138427e-03 3.674259e-03
## 33 No 4.684268e-02 3.981905e-02
## 34 No 1.881618e-02 1.628366e-02
## 35 No 1.693644e-02 1.467692e-02
## 36 No 2.546547e-04 2.312036e-04
## 37 No 3.725845e-02 3.175986e-02
## 38 No 4.349188e-06 4.025695e-06
## 39 No 2.202096e-03 1.970707e-03
## 40 No 9.541384e-17 9.451265e-17
## 41 No 2.178950e-06 2.034887e-06
## 42 No 3.704493e-19 3.686998e-19
## 43 No 1.480770e-16 1.465035e-16
## 44 No 5.849823e-09 5.566656e-09
## 45 No 2.366120e-02 2.042070e-02
## 46 No 2.707496e-06 2.522095e-06
## 47 No 3.172999e-06 2.944483e-06
## 48 No 1.668365e-02 1.447755e-02
## 49 No 2.712726e-06 2.522095e-06
## 50 No 1.103337e-12 1.074679e-12
## 51 No 4.797578e-15 4.712615e-15
## 52 No 1.314644e-05 1.210652e-05
## 53 No 8.798173e-04 7.946401e-04
## 54 No 2.646948e-03 2.362565e-03
## 55 No 2.003131e-15 1.970021e-15
## 56 No 2.420020e-16 2.388591e-16
## 57 No 1.079873e-02 9.447299e-03
## 58 No 3.597141e-19 3.584400e-19
## 59 No 4.340245e-09 4.135275e-09
## 60 No 1.793888e-15 1.766355e-15
## 61 No 2.136601e-04 1.942364e-04
## 62 No 1.824162e-03 1.638946e-03
## 63 No 1.439683e-12 1.397190e-12
## 64 No 1.557456e-06 1.456322e-06
## 65 No 9.546691e-10 9.129658e-10
## 66 No 6.536983e-08 6.205117e-08
## 67 No 5.575037e-11 5.364410e-11
## 68 No 8.826749e-14 8.628746e-14
## 69 No 2.603076e-10 2.501658e-10
## 70 No 8.417656e-13 8.208954e-13
## 71 No 1.415681e-02 1.231826e-02
## 72 No 3.606922e-02 3.087389e-02
## 73 No 1.695218e-03 1.527098e-03
## 74 No 2.453911e-02 2.114941e-02
## 75 No 3.183103e-02 2.732132e-02
## 76 No 1.235521e-05 1.139246e-05
## 77 No 4.133670e-02 3.518744e-02
## 78 No 2.101219e-02 1.815930e-02
## 79 No 9.731768e-03 8.536841e-03
## 80 No 2.885146e-02 2.483201e-02
## 81 No 3.849396e-03 3.422190e-03
## 82 No 6.990259e-07 6.544600e-07
## 83 No 6.031510e-03 5.326529e-03
## 84 No 3.223299e-05 2.956911e-05
## 85 No 6.324472e-03 5.577781e-03
## 86 No 1.620126e-02 1.407808e-02
## 87 No 8.469902e-03 7.449913e-03
## 88 No 2.088646e-04 1.901235e-04
## 89 No 3.659295e-02 3.123578e-02
## 90 No 4.147924e-03 3.677793e-03
## 91 No 2.772002e-09 2.644365e-09
## 92 No 8.369242e-04 7.568877e-04
## 93 No 3.781849e-10 3.625574e-10
## 94 No 9.674336e-03 8.497882e-03
## 95 No 2.107917e-09 2.013347e-09
## 96 No 5.104534e-03 4.519953e-03
## 97 No 1.013297e-02 8.876815e-03
## 98 No 1.312990e-02 1.145572e-02
## 99 No 4.143029e-11 3.991395e-11
## 100 No 1.678015e-16 1.658204e-16
## 101 No 3.310759e-03 2.951148e-03
## 102 No 5.609808e-07 5.278651e-07
## 103 No 3.273084e-13 3.195798e-13
## 104 No 1.490392e-11 1.439363e-11
## 105 No 1.246671e-20 1.246671e-20
## 106 No 2.909568e-18 2.885522e-18
## 107 No 1.703109e-04 1.552302e-04
## 108 No 1.137902e-11 1.100285e-11
## 109 No 1.824162e-03 1.638946e-03
## 110 No 6.874681e-03 6.054913e-03
## 111 No 3.020988e-10 2.899721e-10
## 112 No 8.394322e-08 7.948343e-08
## 113 No 1.728507e-03 1.555044e-03
## 114 No 3.057059e-02 2.627555e-02
## 115 No 2.438091e-14 2.392035e-14
## 116 No 1.622785e-08 1.542316e-08
## 117 No 1.593293e-05 1.463497e-05
## 118 No 1.388090e-02 1.209458e-02
## 119 No 5.645635e-10 5.405679e-10
## 120 No 9.209569e-05 8.415828e-05
## 121 No 1.242210e-02 1.085283e-02
## 122 No 2.538851e-07 2.400967e-07
## 123 No 4.433449e-12 4.292123e-12
## 124 No 3.297697e-02 2.826597e-02
## 125 No 7.806129e-06 7.207076e-06
## 126 No 6.412840e-07 6.011565e-07
## 127 No 1.464873e-04 1.336891e-04
## 128 No 3.502620e-07 3.308260e-07
## 129 No 3.663919e-03 3.261623e-03
## Adjusted_P_Value_BH test_pass
## 1 3.855851e-06 Y
## 2 8.119506e-06 Y
## 3 9.603279e-09 Y
## 4 4.447092e-08 Y
## 5 4.957386e-14 Y
## 6 5.597408e-05 Y
## 7 1.863511e-20 Y
## 8 1.097868e-07 Y
## 9 1.863511e-20 Y
## 10 2.562004e-05 Y
## 11 3.631216e-04 Y
## 12 4.447092e-08 Y
## 13 5.036506e-08 Y
## 14 1.889727e-05 Y
## 15 4.597237e-14 Y
## 16 1.905281e-07 Y
## 17 2.201337e-17 Y
## 18 9.697631e-09 Y
## 19 9.759594e-07 Y
## 20 2.964973e-19 Y
## 21 2.049620e-05 Y
## 22 5.332161e-14 Y
## 23 2.451711e-15 Y
## 24 1.187563e-08 Y
## 25 1.131027e-06 Y
## 26 6.607916e-20 Y
## 27 6.376422e-13 Y
## 28 1.187563e-08 Y
## 29 1.824853e-09 Y
## 30 3.894864e-15 Y
## 31 2.927436e-04 Y
## 32 4.276210e-05 Y
## 33 3.631216e-04 Y
## 34 1.636190e-04 Y
## 35 1.485653e-04 Y
## 36 3.223477e-06 Y
## 37 2.957020e-04 Y
## 38 6.795606e-08 Y
## 39 2.446774e-05 Y
## 40 1.060154e-17 Y
## 41 3.822719e-08 Y
## 42 6.607916e-20 Y
## 43 1.480770e-17 Y
## 44 1.392815e-10 Y
## 45 2.022325e-04 Y
## 46 4.447092e-08 Y
## 47 5.036506e-08 Y
## 48 1.476429e-04 Y
## 49 4.447092e-08 Y
## 50 4.597237e-14 Y
## 51 2.998486e-16 Y
## 52 1.905281e-07 Y
## 53 1.060021e-05 Y
## 54 2.877117e-05 Y
## 55 1.335421e-16 Y
## 56 2.016683e-17 Y
## 57 1.009228e-04 Y
## 58 6.607916e-20 Y
## 59 1.058596e-10 Y
## 60 1.281348e-16 Y
## 61 2.739232e-06 Y
## 62 2.049620e-05 Y
## 63 5.332161e-14 Y
## 64 2.781171e-08 Y
## 65 2.512287e-11 Y
## 66 1.485678e-09 Y
## 67 1.689405e-12 Y
## 68 4.413374e-15 Y
## 69 7.656107e-12 Y
## 70 3.826207e-14 Y
## 71 1.275388e-04 Y
## 72 2.927436e-04 Y
## 73 1.994375e-05 Y
## 74 2.079586e-04 Y
## 75 2.630664e-04 Y
## 76 1.844061e-07 Y
## 77 3.254858e-04 Y
## 78 1.811396e-04 Y
## 79 9.268351e-05 Y
## 80 2.424492e-04 Y
## 81 4.051996e-05 Y
## 82 1.270956e-08 Y
## 83 6.031510e-05 Y
## 84 4.539858e-07 Y
## 85 6.261853e-05 Y
## 86 1.446541e-04 Y
## 87 8.223205e-05 Y
## 88 2.712527e-06 Y
## 89 2.927436e-04 Y
## 90 4.276210e-05 Y
## 91 6.930005e-11 Y
## 92 1.020639e-05 Y
## 93 1.050514e-11 Y
## 94 9.268351e-05 Y
## 95 5.404915e-11 Y
## 96 5.208708e-05 Y
## 97 9.559404e-05 Y
## 98 1.204578e-04 Y
## 99 1.294696e-12 Y
## 100 1.525468e-17 Y
## 101 3.559956e-05 Y
## 102 1.099962e-08 Y
## 103 1.558612e-14 Y
## 104 4.967972e-13 Y
## 105 1.246671e-20 Y
## 106 3.636960e-19 Y
## 107 2.240932e-06 Y
## 108 3.923799e-13 Y
## 109 2.049620e-05 Y
## 110 6.739883e-05 Y
## 111 8.631396e-12 Y
## 112 1.824853e-09 Y
## 113 2.009892e-05 Y
## 114 2.547549e-04 Y
## 115 1.434171e-15 Y
## 116 3.773918e-10 Y
## 117 2.276133e-07 Y
## 118 1.261900e-04 Y
## 119 1.525847e-11 Y
## 120 1.244536e-06 Y
## 121 1.150194e-04 Y
## 122 5.401810e-09 Y
## 123 1.583375e-13 Y
## 124 2.703030e-04 Y
## 125 1.182747e-07 Y
## 126 1.187563e-08 Y
## 127 1.953164e-06 Y
## 128 7.297125e-09 Y
## 129 3.897786e-05 Y
combined_bacteria_G <- combined_bacteria_clean %>%
select(Health_state, ends_with("_G"))
d <- dist(combined_bacteria_G) # euclidean distances between the rows
## Warning in dist(combined_bacteria_G): в результате преобразования созданы NA
fit <- cmdscale(d,eig=TRUE, k=2) # k is the number of dim
df_mds <- data.frame(
x = fit$points[,1],
y = fit$points[,2]
)
df_full <- cbind(df_mds, combined_info) %>% mutate(Health_state_n = case_when(Health_state == "Health" ~ 0,
Health_state == "Disease" ~ 1))
ggplot(df_full, aes(x = x, y = y, color = Health_state)) +
geom_point() +
theme_bw() +
ggtitle("Распределение вектора таксонов в зависимости от группы пациентов")
adonis2(d ~ Health_state_n, data = df_full)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = d ~ Health_state_n, data = df_full)
## Df SumOfSqs R2 F Pr(>F)
## Health_state_n 1 2594 0.02239 8.6805 0.001 ***
## Residual 379 113256 0.97761
## Total 380 115850 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Wilcox_comparison_round_0 <- data_wide %>%
select(Health_state, Archaea, Bacteria,
ends_with(c("_D", "_P", "_O", "_C", "_F", "_G"))) %>%
mutate(across (where (is.numeric), function (x) round (x,0))) %>%
summarise_if (is.numeric, function (x) (wilcox.test(x ~ .$Health_state)$p.value)) %>%
pivot_longer(everything()) %>%
rename (Taxon = name, p_value = value) %>%
filter (p_value <= 0.05 ) %>%
arrange(p_value) %>%
add_column(p_value_holm = p.adjust(.$p_value, "holm")) %>%
add_column(p_value_BH = p.adjust(.$p_value, "BH"))
rbind (
"Количество значимо различающихся таксонов по p_value" = nrow (Wilcox_comparison_round_0),
"Количество значимо различающихся таксонов по p_value_holm" = nrow (Wilcox_comparison_round_0 %>% filter (p_value_holm <= 0.05 )),
"Количество значимо различающихся таксонов по p_value_BH" = nrow (Wilcox_comparison_round_0 %>% filter (p_value_BH <= 0.05 ))
)
## [,1]
## Количество значимо различающихся таксонов по p_value 141
## Количество значимо различающихся таксонов по p_value_holm 66
## Количество значимо различающихся таксонов по p_value_BH 141
Wilcox_comparison_round_1 <- data_wide %>%
select(Health_state, Archaea, Bacteria,
ends_with(c("_D", "_P", "_O", "_C", "_F", "_G"))) %>%
mutate(across (where (is.numeric), function (x) round (x,1))) %>%
summarise_if (is.numeric, function (x) (wilcox.test(x ~ .$Health_state)$p.value)) %>%
pivot_longer(everything()) %>%
rename (Taxon = name, p_value = value) %>%
filter (p_value <= 0.05 ) %>%
arrange(p_value) %>%
add_column(p_value_holm = p.adjust(.$p_value, "holm")) %>%
add_column(p_value_BH = p.adjust(.$p_value, "BH"))
rbind (
"Количество значимо различающихся таксонов по p_value" = nrow (Wilcox_comparison_round_1),
"Количество значимо различающихся таксонов по p_value_holm" = nrow (Wilcox_comparison_round_1 %>% filter (p_value_holm <= 0.05 )),
"Количество значимо различающихся таксонов по p_value_BH" = nrow (Wilcox_comparison_round_1 %>% filter (p_value_BH <= 0.05 ))
)
## [,1]
## Количество значимо различающихся таксонов по p_value 319
## Количество значимо различающихся таксонов по p_value_holm 181
## Количество значимо различающихся таксонов по p_value_BH 319
Wilcox_comparison_round_2 <- data_wide %>%
select(Health_state, Archaea, Bacteria,
ends_with(c("_D", "_P", "_O", "_C", "_F", "_G"))) %>%
mutate(across (where (is.numeric), function (x) round (x,2))) %>%
summarise_if (is.numeric, function (x) (wilcox.test(x ~ .$Health_state)$p.value)) %>%
pivot_longer(everything()) %>%
rename (Taxon = name, p_value = value) %>%
filter (p_value <= 0.05 ) %>%
arrange(p_value) %>%
add_column(p_value_holm = p.adjust(.$p_value, "holm")) %>%
add_column(p_value_BH = p.adjust(.$p_value, "BH"))
rbind (
"Количество значимо различающихся таксонов по p_value" = nrow (Wilcox_comparison_round_2),
"Количество значимо различающихся таксонов по p_value_holm" = nrow (Wilcox_comparison_round_2 %>% filter (p_value_holm <= 0.05 )),
"Количество значимо различающихся таксонов по p_value_BH" = nrow (Wilcox_comparison_round_2 %>% filter (p_value_BH <= 0.05 ))
)
## [,1]
## Количество значимо различающихся таксонов по p_value 646
## Количество значимо различающихся таксонов по p_value_holm 430
## Количество значимо различающихся таксонов по p_value_BH 646
# Получаем уникальные таксоны, удовлетворяющие условиям
unique_taxa <- Wilcox_comparison_round_2 %>%
subset(grepl("_G", Taxon)) %>%
filter(p_value_holm < 0.05) %>%
distinct(Taxon)
# Определяем размер шага (20 таксонов на каждой итерации)
step_size <- 18
# Разбиваем уникальные таксоны на группы по step_size
taxon_groups <- split(unique_taxa, rep(1:ceiling(nrow(unique_taxa) / step_size), each = step_size, length.out = nrow(unique_taxa)))
# Проходим по группам и строим графики
for (i in seq_along(taxon_groups)) {
current_taxa <- taxon_groups[[i]]
G_long_filtered <- G_long %>%
filter(Taxon %in% current_taxa$Taxon & Percentage > 0) %>%
group_by(Health_state, Taxon) %>%
summarise(Count = n(), .groups = "drop")
bar_plot <- ggplot(G_long_filtered, aes(x = Health_state, y = Count, fill = Health_state == "Disease")) +
geom_bar(position = "dodge", color = "black", stat = "identity") +
scale_fill_manual(values = c("skyblue", "red"), guide = FALSE) +
labs(title = paste("Гистограмма для таксонов", (i - 1) * step_size + 1, "до", min(i * step_size, nrow(unique_taxa)), "в разрезе Health_state"),
x = "Статус пациента",
y = "Количество пациентов, у которых обнаружен данный таксон") +
coord_flip() +
facet_wrap(~Taxon, scales = "free_y", ncol = 2, shrink = 0.7) + # Уменьшение пространства между фасетами
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5), # Поворот текста на оси X и уменьшение шрифта
axis.text.y = element_text(hjust = 1, size = 5),
strip.text = element_text(size = 5)) # Уменьшение шрифта для названия фасет
print(bar_plot) # Отображение графика на экране
}
## Warning: The `guide` argument in `scale_*()` cannot be `FALSE`. This was deprecated in
## ggplot2 3.3.4.
## ℹ Please use "none" instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Создаем набор данных с ограничением на первые 10 таксонов с разными Health_state
unique_taxa <- Wilcox_comparison_round_2 %>%
subset(grepl("_G", Taxon)) %>%
filter(p_value_holm < 0.05) %>%
distinct(Taxon)
data <- G_long %>%
filter(Taxon %in% unique_taxa$Taxon & Percentage > 0) %>%
group_by(Taxon, Health_state) %>%
summarise(Count = n(), .groups = "drop") %>%
arrange(Taxon, Health_state) %>%
slice_head(n = 20)
# Add a unique id for each Taxon
data <- data %>%
mutate(id = group_indices(., Taxon))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `id = group_indices(., Taxon)`.
## Caused by warning:
## ! The `...` argument of `group_indices()` is deprecated as of dplyr 1.0.0.
## ℹ Please `group_by()` first
# Set a number of 'empty bar'
empty_bar <- 10
# Create empty rows with unique id for each Taxon
to_add <- data.frame(
Taxon = rep(unique(data$Taxon), each = empty_bar),
Health_state = rep(levels(data$Health_state), times = length(unique(data$Taxon))),
Count = rep(NA, empty_bar),
id = rep(seq_len(length(unique(data$Taxon))), each = empty_bar)
)
# Combine the initial dataset with the empty rows
data <- rbind(data, to_add) %>%
arrange(Taxon, Health_state)
# Reset the id values
data$id <- seq_len(nrow(data))
# Исправление: создание переменной value на основе Count
data$value <- data$Count
# Get the name and the y position of each label
label_data <- data
number_of_bar <- nrow(label_data)/2
angle <- 90 - 360 * (label_data$id-0.5) / (number_of_bar * 2)
label_data$hjust <- ifelse(angle < -90, 1, 0)
label_data$angle <- ifelse(angle < -90, angle+180, angle)
# Make the plot
p <- ggplot(data, aes(x=as.factor(id), y=value, fill=Health_state)) +
geom_bar(stat="identity", position="stack", alpha=0.7) +
scale_fill_manual(values = c("skyblue", "red")) + # Используем только два цвета
ylim(c(0, max(data$value) + 10)) +
theme_minimal() +
theme(
axis.text = element_blank(),
axis.title = element_blank(),
panel.grid = element_blank(),
plot.margin = unit(rep(-1,4), "cm")
) +
coord_polar(start = 0) +
geom_text(data=label_data, aes(x=id, y=value+5, label=Taxon, hjust=hjust), color="black", fontface="bold", alpha=0.6, size=2.5, angle=label_data$angle, inherit.aes = FALSE ) +
geom_text(aes(x=as.factor(id), y=value, label=Count), position=position_stack(vjust=0.5), size=2.5, color="black") + # Добавление количества наблюдений
labs(title = "График 'солнышко' для первых 10 пар таксонов с разными Health_state")
print(p)
## Warning: Removed 110 rows containing missing values (`position_stack()`).
## Warning: Removed 110 rows containing missing values (`position_stack()`).
## Warning: Removed 110 rows containing missing values (`geom_text()`).
# Определяем размер шага (4 таксона на каждой итерации)
step_size <- 4
# Разбиваем уникальные таксоны на группы по step_size
taxon_groups <- split(unique_taxa, rep(1:ceiling(nrow(unique_taxa) / step_size), each = step_size, length.out = nrow(unique_taxa)))
# Проходим по группам и строим боксплоты для переменной Age
for (i in seq_along(taxon_groups)) {
current_taxa <- taxon_groups[[i]]
G_long_filtered <- G_long %>%
filter(Taxon %in% current_taxa$Taxon & Percentage > 0)
# Строим боксплот
box_plot <- ggplot(G_long_filtered, aes(x = Health_state, y = Age, fill = Health_state)) +
geom_boxplot() +
labs(title = paste("Боксплоты для таксонов по статусу"),
x = "Health_state",
y = "Age") +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8), # Уменьшаем размер шрифта для подписей Taxon
axis.title.x = element_blank(), # Убираем название оси X
legend.position = "bottom") + # Перемещаем легенду вниз
scale_fill_manual(values = c("red", "skyblue")) +
facet_grid(~Taxon, scales = "free_y", space = "free") # Используем facet_grid вместо facet_wrap
# Отображаем график на экране
print(box_plot)
}
# Определяем размер шага (3 таксона на каждой итерации)
step_size <- 3
# Разбиваем уникальные таксоны на группы по step_size
taxon_groups <- split(unique_taxa, rep(1:ceiling(nrow(unique_taxa) / step_size), each = step_size, length.out = nrow(unique_taxa)))
# Проходим по группам и строим боксплоты для переменной Age
for (i in seq_along(taxon_groups)) {
current_taxa <- taxon_groups[[i]]
G_long_filtered <- G_long %>%
filter(Taxon %in% current_taxa$Taxon & Percentage > 0)
# Строим боксплот
box_plot <- ggplot(G_long_filtered, aes(x = interaction(Health_state, Sex), y = Age, fill = Health_state)) +
geom_boxplot() +
labs(title = paste("Боксплоты для таксона (Пол + статус заболевания)"),
x = "Health_state + Sex",
y = "Age") +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8), # Уменьшаем размер шрифта для подписей Taxon
axis.title.x = element_blank(), # Убираем название оси X
legend.position = "bottom") + # Перемещаем легенду вниз
scale_fill_manual(values = c("red", "skyblue")) +
facet_grid(~Taxon, scales = "free_y", space = "free") # Используем facet_grid вместо facet_wrap
# Отображаем график на экране
print(box_plot)
}
# Определяем размер шага (3 таксона на каждой итерации)
step_size <- 3
# Разбиваем уникальные таксоны на группы по step_size
taxon_groups <- split(unique_taxa, rep(1:ceiling(nrow(unique_taxa) / step_size), each = step_size, length.out = nrow(unique_taxa)))
# Проходим по группам и строим боксплоты для переменной Age
for (i in seq_along(taxon_groups)) {
current_taxa <- taxon_groups[[i]]
G_long_filtered <- G_long %>%
filter(Taxon %in% current_taxa$Taxon & Percentage > 0)
# Строим боксплот
box_plot <- ggplot(G_long_filtered, aes(x = interaction(Health_state, Smoking), y = Age, fill = Health_state)) +
geom_boxplot() +
labs(title = paste("Боксплоты для таксона (Статус курения + статус заболевания)"),
x = "Health_state + Smoking",
y = "Age") +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8), # Уменьшаем размер шрифта для подписей Taxon
axis.title.x = element_blank(), # Убираем название оси X
legend.position = "bottom") + # Перемещаем легенду вниз
scale_fill_manual(values = c("red", "skyblue")) +
facet_grid(~Taxon, scales = "free_y", space = "free") # Используем facet_grid вместо facet_wrap
# Отображаем график на экране
print(box_plot)
}
# Определяем размер шага (3 таксона на каждой итерации)
step_size <- 3
# Разбиваем уникальные таксоны на группы по step_size
taxon_groups <- split(unique_taxa, rep(1:ceiling(nrow(unique_taxa) / step_size), each = step_size, length.out = nrow(unique_taxa)))
# Проходим по группам и строим боксплоты для переменной Age
for (i in seq_along(taxon_groups)) {
current_taxa <- taxon_groups[[i]]
G_long_filtered <- G_long %>%
filter(Taxon %in% current_taxa$Taxon & Percentage > 0)
# Строим боксплот
box_plot <- ggplot(G_long_filtered, aes(x = interaction(Health_state, Alcohol), y = Age, fill = Health_state)) +
geom_boxplot() +
labs(title = paste("Боксплоты для таксона (Статус употребления алкоголя + статус заболевания)"),
x = "Health_state + Alcohol",
y = "Age") +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8), # Уменьшаем размер шрифта для подписей Taxon
axis.title.x = element_blank(), # Убираем название оси X
legend.position = "bottom") + # Перемещаем легенду вниз
scale_fill_manual(values = c("red", "skyblue")) +
facet_grid(~Taxon, scales = "free_y", space = "free") # Используем facet_grid вместо facet_wrap
# Отображаем график на экране
print(box_plot)
}